Introduction

Description and Simulation: In this worksheet, I create an artificial data set, the age of a population, using a composition technique. In other words, I draw six samples from normal distributions centred on different ages to portray the different age groups, and then explore the data with descriptive statistics, which lets us understand the data from different perspectives.

Construct a Data Set

import pandas as pd
import numpy as np
import seaborn as sns
import random
import plotly.offline as py
import plotly.graph_objects as go
import plotly.express as px
#import chart_studio.plotly as py

"""np.random.normal o generate a vector of random values that follow a normal distribution 
with a specific mean and standard deviation: mean, sd, size """ 

#Note: because the data are made up by drawing from normal distributions, some observations are negative,
#but age can't be negative, so this needs to be handled later
# Seed BOTH generators so the notebook is reproducible: `random` drives the
# later random.choice()/random.sample() calls, while the np.random.normal()
# draws below use NumPy's own global generator, which random.seed() does not touch.
random.seed(112)
np.random.seed(112)

# Each vector is one age group: loc = mean age, scale = standard deviation,
# size = number of people drawn for that group.
a = np.random.normal(loc=5, scale=7, size=2000)
b = np.random.normal(loc=5, scale=7, size=2000)
c = np.random.normal(loc=35, scale=5, size=2500)
d = np.random.normal(loc=50, scale=8, size=3000)
e = np.random.normal(loc=70, scale=5, size=1000)
f = np.random.normal(loc=80, scale=7, size=1000)

# combine all the vectors into a single population
pop = np.concatenate((a, b, c, d, e, f))

pop.shape

(11500,)

# making all data become absolute value and casting the type to int
pop = np.absolute(pop).astype(int)

# exclud age = 0
pop = pop[pop != 0]

pop

array([ 8,  4,  9, ..., 75, 87, 63])

# the number of observations does not decrease significantly, so it should be fine
pop.shape

(11136,)

df = pd.DataFrame(pop, columns = ['Age'])

sex = ['Male','Female']

#random.choice() is a function using to pick a random value from a list
print(random.choice(sex))

Female

# Draw one independent random sex per row. Iterating over the index yields one
# draw per row, so no placeholder column has to be created first.
df['Sex'] = [random.choice(sex) for _ in df.index]

df

	Age	Sex
0	8	Female
1	4	Female
2	9	Male
3	1	Female
4	6	Male
...	...	...
11131	67	Male
11132	73	Female
11133	75	Female
11134	87	Male
11135	63	Female

11136 rows × 2 columns

df

	Age	Sex
0	8	Female
1	4	Female
2	9	Male
3	1	Female
4	6	Male
...	...	...
11131	67	Male
11132	73	Female
11133	75	Female
11134	87	Male
11135	63	Female

11136 rows × 2 columns

df.groupby(['Sex']).count()

	Age
Sex
Female	5524
Male	5612

Explore data with some visualizations

#Prepare the data
"""
I want to make a pyramid population catagorized by gender aging interval. First step is that 
I might  need  to put each person into different bins depending on their age
"""

'\nI want to make a pyramid population catagorized by gender aging interval. First step is that \nI might  need  to put each person into different bins depending on their age\n'

# Create the Age_Interval column with 5 bins.
# Each entry is (lower bound, upper bound, which side(s) are closed, label).
# Only the first bin is closed on both sides; the others are right-closed, so a
# boundary age such as 20 or 40 stays in the lower bin.
age_bins = (
    (1, 20, 'both', '1-20'),
    (20, 40, 'right', '21-40'),
    (40, 60, 'right', '41-60'),
    (60, 80, 'right', '61-80'),
    (80, 100, 'right', '81-100'),
)
for lower, upper, closed, label in age_bins:
    in_bin = df['Age'].between(lower, upper, closed)
    df.loc[in_bin, 'Age_Interval'] = label

#Credit: https://medium.com/towards-data-science/how-to-bin-numerical-data-with-pandas-fe5146c9dc55

df

	Age	Sex	Age_Interval
0	8	Female	1-20
1	4	Female	1-20
2	9	Male	1-20
3	1	Female	1-20
4	6	Male	1-20
...	...	...	...
11131	67	Male	61-80
11132	73	Female	61-80
11133	75	Female	61-80
11134	87	Male	81-100
11135	63	Female	61-80

11136 rows × 3 columns

df1 = df.groupby(['Age_Interval','Sex'])[['Age']].count().reset_index().rename(columns={'Age':'Number_of_Pop'})

#Notice that the first attribute is repeated: this is long format, so we need to convert it to wide format
#For analysis purposes we mostly want wide format, but plotting graphs in R or some analytical tools might require long format
df1

	Age_Interval	Sex	Number_of_Pop
0	1-20	Female	1812
1	1-20	Male	1775
2	21-40	Female	1295
3	21-40	Male	1379
4	41-60	Female	1314
5	41-60	Male	1334
6	61-80	Female	890
7	61-80	Male	886
8	81-100	Female	213
9	81-100	Male	238

#pivot method is used to convert from long to wide
df2 =pd.pivot(df1,index='Age_Interval' ,columns='Sex', values='Number_of_Pop')

#Credit: https://towardsdatascience.com/reshaping-a-pandas-dataframe-long-to-wide-and-vice-versa-517c7f0995ad

df2

Sex	Female	Male
Age_Interval
1-20	1812	1775
21-40	1295	1379
41-60	1314	1334
61-80	890	886
81-100	213	238

df2['Female'].dtype

dtype('int64')

women_bins = [i*-1 for i in df2['Female']]

len(women_bins)



women_bins = np.array(women_bins)
men_bins = np.array(df2['Male'])

df3 = df2.reset_index() # dropping Age_Interval from being an index 
y =list(df3['Age_Interval']) # convert to list and utilize it as YAxis

# Symmetric x-axis: the women's counts are stored as negative numbers so their
# bars extend to the left, and the tick labels show absolute values on both sides.
tick_positions = [-2000, -1500, -1000, -500, 0, 500, 1000, 1500, 2000]
x_axis = go.layout.XAxis(
    range=[-2200, 2200],
    tickvals=tick_positions,
    ticktext=[abs(tick) for tick in tick_positions],
    title='Number_of_Population',
)
layout = go.Layout(
    yaxis=go.layout.YAxis(title='Age'),
    xaxis=x_axis,
    barmode='overlay',
    bargap=0.1,
)

# Men: positive counts, drawn to the right of zero.
men_trace = go.Bar(
    y=y,
    x=men_bins,
    orientation='h',
    name='Men',
    hoverinfo='x',
    marker=dict(color='powderblue'),
)

# Women: negative counts, drawn to the left; the hover text flips the sign back
# so the tooltip shows the real (positive) head count.
women_trace = go.Bar(
    y=y,
    x=women_bins,
    orientation='h',
    name='Women',
    text=-1 * women_bins.astype('int'),
    hoverinfo='text',
    marker=dict(color='seagreen'),
)

data = [men_trace, women_trace]

py.iplot(dict(data=data, layout=layout), filename='EXAMPLES/bar_pyramid')

The pyramid chart above gives an overall sense of the population grouped by age interval and sex. We can see that the largest share of the population is in the 1-20 years old range.

Measures of Central Tendency and Dispersion of the Data

Now I want to explore the central tendency of the population's age, so I go back to the data as it was before being categorized into bins. To gain a better understanding, I’ll show some manual calculations of the descriptive statistics before using library functions to get the same results.

pop # age of individuals, population

array([ 8,  4,  9, ..., 75, 87, 63])

pop.shape # number of populaiton

(11136,)

mean_pop = round(sum(pop)/len(pop))
mean_pop
# Average age of population is 37 years old

np.median(pop)
# Median of pop is 37

37.0

def mode(lst):
    """Return every value that occurs most often in *lst*.

    Args:
        lst: Any iterable of hashable values (a list, a 1-D NumPy array, ...).

    Returns:
        A list with all values tied for the highest frequency, in order of
        first appearance. An empty input yields an empty list.
    """
    from collections import Counter

    # Counter keeps insertion order, so ties are reported in first-seen order.
    freq = Counter(lst)
    if not freq:
        # No data means no mode; avoids max() failing on an empty sequence.
        return []

    # highest frequency found among all values
    hf = max(freq.values())

    # keep every value that reaches that frequency (there may be several)
    return [value for value, count in freq.items() if count == hf]
 
# calling mode() function and passing list
# as argument
print(mode(pop))

#Credit: https://www.geeksforgeeks.org/how-to-calculate-the-mode-of-numpy-array/

[1]

pop

array([ 8,  4,  9, ..., 75, 87, 63])

# Observe the dispersion of the data through the deviations from the mean:
# subtract the mean from every element (vectorized, no explicit loop needed).
# The exact mean is used here instead of the rounded mean_pop; deviations taken
# from a rounded mean do not average out to zero and bias the MAD computed below.
dev = pop - pop.mean()
dev

array([-29, -33, -28, ...,  38,  50,  26])

np.mean(dev) # the mean of deviation is usually be zero

-0.20510057471264367

#absolute deviation of mean
dev = abs(dev)
dev

array([29, 33, 28, ..., 38, 50, 26])

#Mean Absolute deviation or 'MAD'
np.mean(dev)
print("Mean Absolute deviation is % s "
                % (np.mean(dev)))

Mean Absolute deviation is 20.97162356321839

import statistics
pop_list = pop.tolist()
print("Standard Deviation of sample is % s "
                % (statistics.stdev(pop_list)))

Standard Deviation of sample is 25.072255798045703

Mean absolute deviation (MAD) is a measure of the average absolute distance between each data value and the mean of a data set. Similar to standard deviation, MAD is a parameter or statistic that measures the spread, or variation, in your data.

Even though both MAD and SD measure the spread of the data, SD is usually larger than MAD because SD is more sensitive to values that are farther away from the mean. For more detail on MAD and SD: https://articles.outlier.org/mean-absolute-deviation-meaning

Describing Dispersion

#Using describe to see the basic descriptive measurements
df['Age'].describe()

count    11136.000000
mean        36.794899
std         25.072256
min          1.000000
25%         11.000000
50%         37.000000
75%         55.000000
max        100.000000
Name: Age, dtype: float64

#Range
# Stored as age_range so the built-in range() is not shadowed for the rest of
# the session.
age_range = df['Age'].max() - df['Age'].min()
print(f'Range is {age_range}')

Range is 99

#Interquartile range = Q3 – Q1
q1, q3 = np.percentile(df.Age,[25,75]) 
iqr = q3 - q1
print(iqr)

44.0

Detecting Outlier

An outlier is defined here as any data point that lies below the lower limit (Q1 - 1.5 × IQR) or above the upper limit (Q3 + 1.5 × IQR).

#find the lower limit and upper limit
lower_limit = q1 - (1.5*iqr)
upper_limit = q3 + (1.5*iqr)
print(lower_limit, upper_limit)

# we can conclude that anyone in our population older than 121 (the upper limit) would be considered an outlier

-55.0 121.0

# try selecting a sample set and consider its statistic measurement
age_list = df.Age.tolist()
sample = random.sample(age_list,2500)

sample = pd.DataFrame(sample)
sample.describe()

	0
count	2500.000000
mean	36.736800
std	24.606489
min	1.000000
25%	11.000000
50%	37.000000
75%	54.000000
max	94.000000

df['Age'].describe()

count    11136.000000
mean        36.794899
std         25.072256
min          1.000000
25%         11.000000
50%         37.000000
75%         55.000000
max        100.000000
Name: Age, dtype: float64

# the statistical measurement of sample set and population are quite similar
# the sample set well represent the population

df

	Age	Sex	Age_Interval
0	8	Female	1-20
1	4	Female	1-20
2	9	Male	1-20
3	1	Female	1-20
4	6	Male	1-20
...	...	...	...
11131	67	Male	61-80
11132	73	Female	61-80
11133	75	Female	61-80
11134	87	Male	81-100
11135	63	Female	61-80

11136 rows × 3 columns

# visualizing the population with a boxplot, categorized by sex

df_box = df
fig = px.box(df_box, x="Sex", y="Age")
fig.show()

# from our artificial data here, I'll try adding a 'Country' attribute and making a more dynamic visualization
df

	Age	Sex	Age_Interval
0	8	Female	1-20
1	4	Female	1-20
2	9	Male	1-20
3	1	Female	1-20
4	6	Male	1-20
...	...	...	...
11131	67	Male	61-80
11132	73	Female	61-80
11133	75	Female	61-80
11134	87	Male	81-100
11135	63	Female	61-80

11136 rows × 3 columns

country = ['Thailand', 'Taiwan', 'Japan', 'Germany']

# Draw one independent random country per row. Iterating over the index yields
# one draw per row, so no placeholder column has to be created first.
df['Country'] = [random.choice(country) for _ in df.index]

df

	Age	Sex	Age_Interval	Country
0	8	Female	1-20	Germany
1	4	Female	1-20	Japan
2	9	Male	1-20	Japan
3	1	Female	1-20	Germany
4	6	Male	1-20	Japan
...	...	...	...	...
11131	67	Male	61-80	Taiwan
11132	73	Female	61-80	Japan
11133	75	Female	61-80	Japan
11134	87	Male	81-100	Thailand
11135	63	Female	61-80	Thailand

11136 rows × 4 columns

from dash import Dash, dcc, html, Input, Output
from jupyter_dash import JupyterDash

app = JupyterDash(__name__)


# Page layout: a title, a checklist choosing the grouping column(s) for the
# x-axis, a radio selector for the y-axis column, and the graph itself.
app.layout = html.Div([
    html.H4("Analysis of Age Distribution in Population"),
    html.P("x-axis:"),
    dcc.Checklist(
        id='x-axis', 
        options=['Country', 'Sex'], 
        inline=True
    ),
    html.P("y-axis:"),
    dcc.RadioItems(
        id='y-axis', 
        # Without `options` the selector renders no choices, even though a
        # value is preset; 'Age' is the only numeric column in the data.
        options=['Age'],
        value='Age', 
        inline=True
    ),
    dcc.Graph(id="graph"),
])


@app.callback(
    Output("graph", "figure"), 
    Input("x-axis", "value"), 
    Input("y-axis", "value"))
def generate_chart(x, y):
    """Build the box plot shown in the "graph" component.

    Called by Dash whenever the "x-axis" or "y-axis" control changes.

    Args:
        x: Current value of the "x-axis" checklist.
        y: Current value of the "y-axis" radio selector.

    Returns:
        A Plotly Express box-plot figure drawn from the module-level ``df``.
    """
    return px.box(df, x=x, y=y)

if __name__ == '__main__':
    app.run_server(mode="inline")